import warnings

# plotting libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# To split the data into Train and Test Data
from sklearn.model_selection import train_test_split
# To calculate the accuracy score of the model
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 —
# prefer ConfusionMatrixDisplay if this import fails on a newer version.
from sklearn.metrics import average_precision_score, confusion_matrix, accuracy_score, classification_report, plot_confusion_matrix
# To Scale the Data
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
# To One-Hot Encode the Categorical variables
from sklearn.preprocessing import OneHotEncoder
# Support Vector classifier — used as the stacking meta-classifier below
from sklearn.svm import SVC

warnings.filterwarnings("ignore")
%matplotlib inline
# Load the Parkinson's voice-measurement dataset (Google Colab file path).
df = pd.read_csv('/content/Data - Parkinsons.csv')
# Peek at the first three rows to sanity-check the load.
df.head(3)
# Shape of the Data: (rows, columns)
df.shape
# Column dtypes and non-null counts
df.info()
# Summary statistics (count/mean/std/quartiles) for the numeric columns
df.describe()
# ------------------------------------------------------------------
# Univariate EDA: for each acoustic feature, draw a histogram with a
# KDE overlay (distribution shape) and a boxplot (outlier testing).
# Replaces 22 copy-pasted cell pairs with a single loop; plt.show()
# is required so every figure renders when drawn inside a loop.
# ------------------------------------------------------------------
feature_cols = [
    'MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
    'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
    'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
    'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA',
    'spread1', 'spread2', 'D2', 'PPE',
]
for col in feature_cols:
    # Distribution of the feature
    sns.displot(df, x=col, kde=True)
    plt.show()
    # Outliers testing
    plt.figure()
    sns.boxplot(df[col], palette="Set3")
    plt.show()
# Distribution of the target column ('status'; presumably 1 = Parkinson's,
# 0 = healthy — TODO confirm encoding against the dataset description)
sns.countplot(df['status'])
# Pairwise scatter plots of every feature pair (slow with this many columns)
sns.pairplot(df)
# Modelling Process - Before the modelling process, we will do all the
# feature scaling, as the distributions of the features vary.
# Correlation
# Correlation heatmap of the features.
# FIX: 'name' is a string column, and DataFrame.corr on a mixed-type frame
# raises in pandas >= 2.0 — restrict the correlation to numeric columns.
plt.figure(figsize=(16, 5))
sns.heatmap(df.corr('pearson', numeric_only=True), annot=True, fmt='.1g')
# Features: drop the identifier and the target.
x = df.drop(['name', 'status'], axis=1)
# Target as a 1-D Series (a one-column DataFrame triggers sklearn's
# column-vector/ravel warnings when fitting).
y = df['status']
# Standard-scale every feature column (zero mean, unit variance).
X_std = pd.DataFrame(StandardScaler().fit_transform(x), columns=x.columns)
# Stratified 70/30 split keeps the class balance in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    X_std, y, random_state=42, stratify=y, test_size=0.3)
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Logistic Regression baseline on the scaled features.
model_lr = LogisticRegression(random_state=0)
# Fit the Logistic Regression Model on data
model_lr.fit(x_train, y_train)
# Predict on both splits.
# BUG FIX: the original called `model.predict`, but no `model` was ever
# defined (NameError) — the fitted estimator is `model_lr`.
y_pred_train_lr = model_lr.predict(x_train)
y_pred_test_lr = model_lr.predict(x_test)
# Accuracy on the held-out test split
accuracy_score(y_test, y_pred_test_lr)
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_test_lr, digits=2))
# Confusion matrix rendered as an annotated heatmap
conf_mat = confusion_matrix(y_test, y_pred_test_lr)
df_conf_mat = pd.DataFrame(conf_mat)
plt.figure(figsize=(6, 4))
sns.heatmap(df_conf_mat, annot=True, cmap='Blues', fmt='g')
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Try k-NN with several neighbourhood sizes (the original copy-pasted three
# identical cells; the "k = 9" comment there actually fitted k = 7).
knn_models = {}
for k in (3, 5, 7):
    # instantiate and fit the learning model for this k
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    knn_models[k] = model
    # predict the response and report test accuracy
    y_pred = model.predict(x_test)
    print(accuracy_score(y_test, y_pred))
# Keep the fitted models under the original names; knn3 is reused by the
# stacking ensemble further down.
knn3 = knn_models[3]
knn5 = knn_models[5]
knn7 = knn_models[7]
y_pred_test_knn = knn3.predict(x_test)
y_pred_train_knn = knn3.predict(x_train)
# Accuracy of Test Data
accuracy_score(y_test, y_pred_test_knn)
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_test_knn, digits=2))
# Confusion matrix heatmap
conf_mat = confusion_matrix(y_test, y_pred_test_knn)
df_conf_mat = pd.DataFrame(conf_mat)
plt.figure(figsize=(6, 4))
sns.heatmap(df_conf_mat, annot=True, cmap='Blues', fmt='g')
# Naive Bayes
from sklearn.naive_bayes import GaussianNB  # Gaussian variant of Naive Bayes

# Build and train the Gaussian Naive Bayes classifier on the scaled features.
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)

# Predictions on both splits.
y_pred_train_nb = nb_model.predict(x_train)
y_pred_test_nb = nb_model.predict(x_test)

# Accuracy on the held-out test split.
accuracy_score(y_test, y_pred_test_nb)
# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred_test_nb, digits=2))

# Confusion matrix rendered as an annotated heatmap.
conf_mat = confusion_matrix(y_test, y_pred_test_nb)
df_conf_mat = pd.DataFrame(conf_mat)
plt.figure(figsize=(6, 4))
sns.heatmap(df_conf_mat, annot=True, cmap='Blues', fmt='g')
from mlxtend.classifier import StackingClassifier
# BUG FIX: SVC was used below without ever being imported (NameError).
from sklearn.svm import SVC

# Meta-classifier for the stack: a Support Vector classifier.
svc = SVC()
# Stack the three base learners fitted above; their predictions feed the SVC.
sclf = StackingClassifier(classifiers=[nb_model, knn3, model_lr],
                          meta_classifier=svc)
sclf.fit(x_train, y_train)
y_pred_test_meta = sclf.predict(x_test)
# Accuracy of the stacked ensemble on the held-out test split
accuracy_score(y_test, y_pred_test_meta)
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_test_meta, digits=2))
# Confusion matrix heatmap
conf_mat = confusion_matrix(y_test, y_pred_test_meta)
df_conf_mat = pd.DataFrame(conf_mat)
plt.figure(figsize=(6, 4))
sns.heatmap(df_conf_mat, annot=True, cmap='Blues', fmt='g')
# Bagging Classifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 50 decision trees.
# FIX: the `base_estimator` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4 — the parameter is now called `estimator`.
bgcl = BaggingClassifier(estimator=DecisionTreeClassifier(),
                         n_estimators=50, random_state=1)
bgcl = bgcl.fit(x_train, y_train)
y_pred_test_bagging = bgcl.predict(x_test)
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred_test_bagging, digits=2))
# Confusion matrix heatmap
conf_mat = confusion_matrix(y_test, y_pred_test_bagging)
df_conf_mat = pd.DataFrame(conf_mat)
plt.figure(figsize=(6, 4))
sns.heatmap(df_conf_mat, annot=True, cmap='Blues', fmt='g')
# XGBoost Classifier
from xgboost import XGBClassifier

# Gradient-boosted trees with default hyper-parameters.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

# Predict on the held-out split.
y_pred_test_xgb = xgb.predict(x_test)

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred_test_xgb, digits=2))

# Confusion matrix rendered as an annotated heatmap.
conf_mat = confusion_matrix(y_test, y_pred_test_xgb)
df_conf_mat = pd.DataFrame(conf_mat)
plt.figure(figsize=(6, 4))
sns.heatmap(df_conf_mat, annot=True, cmap='Blues', fmt='g')
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees, fixed seed for reproducibility.
rfcl = RandomForestClassifier(n_estimators=50, random_state=1)
rfcl = rfcl.fit(x_train, y_train)

# Predict on the held-out split.
y_pred_test_rf = rfcl.predict(x_test)

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred_test_rf, digits=2))

# Confusion matrix rendered as an annotated heatmap.
conf_mat = confusion_matrix(y_test, y_pred_test_rf)
df_conf_mat = pd.DataFrame(conf_mat)
plt.figure(figsize=(6, 4))
sns.heatmap(df_conf_mat, annot=True, cmap='Blues', fmt='g')
# Conclusion -